# Originally made 9.18.19; updated 10.22.19 and 12.11.19. Scatter plot of UKB-level mismap score v1.0 with multimapping ENSP column counts.
import os
import sys
import numpy as np
import pandas as pd
import csv
from ast import literal_eval
from statistics import mean
# plotting
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
# Make the shared Toolbox helpers (all_funx) importable, configure notebook
# display, then move into the multimapping-ID data directory.
sys.path.append("/Users/mariapalafox/Desktop/Toolbox/")
from all_funx import *
from IPython.display import display, HTML
pd.set_option('display.max_columns', None)
pd.options.display.max_seq_items = 2000
# widen the notebook container for easier viewing of wide frames
display(HTML("<style>.container {width:90% !important;}</style>"))
os.chdir("/Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/TSV_UNIPROT_xref/MULTIMAPPING_IDS")
print(os.listdir())
# Previously did this when I had the total number of ENSP per ID; now I have the number of unique ENSP per ID.
# np.where can compare values across multiple columns' rows;
# the cleanest way is to check all columns against the first column
# NOTE(review): `df` is created in an earlier (unseen) notebook cell.
dfcount = df[['v97_ENSP_count', 'v85_ENSP_count',
              'v92_ENSP_count', 'v94_ENSP_count', 'v96_ENSP_count']].copy()
dfcount.head(3)
v97_ENSP_count v85_ENSP_count v92_ENSP_count v94_ENSP_count v96_ENSP_count
0 2 2 2 2 2
1 1 1 1 1 1
2 2 3 2 2 2
dfcount.eq(dfcount.iloc[:,0],axis=0)
# Now use all(): if every column equals the first, they are all equal.
# True when an ID has the same ENSP count in all five releases.
df['sameCountENSP'] = dfcount.eq(dfcount.iloc[:, 0], axis=0).all(axis=1)
# Previous results:
# For UniProt IDs with a consistent number of linked ENSP IDs, the average frac_missed score is 0.15.
# For UniProt IDs that do NOT have a consistent number of linked ENSP IDs, the average frac_missed score is 0.29.
# dfc is counts version of df
def summarizeMultimapping(dfc, df):
    """Summarize per-ID ENSP multimapping counts across releases.

    dfc holds one count column per Ensembl release (rows aligned with df);
    df supplies the UniProt accession (xref) and primary gene name.
    Returns a frame with the per-row total, mean, and standard deviation of
    the release counts, sorted ascending by the total.
    """
    summary = pd.concat(
        [df.xref, df.geneNamePrimary,
         dfc.sum(axis=1), dfc.mean(axis=1), dfc.std(axis=1)],
        axis=1)
    summary.columns = ['xref', 'geneNamePrimary', 'totalENSPlinked',
                       'meanNumENSPlinked', 'stdENSPlinked']
    return summary.sort_values('totalENSPlinked')
# Summarize the per-release ENSP counts for every UniProt ID.
dfallcounts = df[['v97_ENSP_count', 'v85_ENSP_count',
                  'v92_ENSP_count', 'v94_ENSP_count', 'v96_ENSP_count']].copy()
dall = summarizeMultimapping(dfallcounts, df)
# mapping these summary columns back onto df via ID -> value dicts
ref_tot = dict(zip(dall.xref, dall.totalENSPlinked))
ref_mean = dict(zip(dall.xref, dall.meanNumENSPlinked))
ref_std = dict(zip(dall.xref, dall.stdENSPlinked))
# seed the new columns with the xref ID, then map ID -> summary stat
df['total_ENSP_linked'] = df['xref']
df['mean_ENSP_linked'] = df['xref']
df['std_ENSP_linked'] = df['xref']
df.total_ENSP_linked = df.total_ENSP_linked.map(ref_tot)
df.mean_ENSP_linked = df.mean_ENSP_linked.map(ref_mean)
df.std_ENSP_linked = df.std_ENSP_linked.map(ref_std)
# importing the UniProt gene-name key to start pairing gene names with IDs
genes = pd.read_csv("GENE_ID_UKB.csv")
genes.head(5)
genes.dropna(inplace = True)
# split the gene-name column; first token is the primary HGNC symbol
splitdf = genes["gene_names"].str.split(" ", n=1, expand=True)
splitdf_final = splitdf[0]
# NOTE(review): splitdf_final is a Series, so assigning .columns below has
# no effect; the genes.columns rename afterwards is what names it.
splitdf_final.columns = ['HGNC_gene']
genes = pd.concat([genes, splitdf_final], axis=1)
genes.columns = ['ID', 'GeneName_primary', 'HGNC_ID', 'HGNC_name']
genes.columns
#genes.to_csv("GENE_ID_KEY_NOEXCEL_OPENING.csv", index=False)
# UniProt ID -> primary HGNC gene-name lookup
ref_gene = dict(zip(genes.ID, genes.HGNC_name))
# load the four per-ID tables and align them on the UniProt ID index
allset = pd.read_csv("ALL_releases_set_counts_ID_types_3953.csv")
alllist = pd.read_csv("ALL_releases_list_stdev_ID_versions_3953.csv")
ultimate = pd.read_csv("ultimate_UKB_sequence_key_3953.csv")
mismap = pd.read_csv("MISMAP2.0/R_dynamic_slope_scores_3953.csv")
allset.columns = ['ID', 'ENSP', 'ENSPv', 'ENST', 'ENSTv', 'ENSG', 'ENSGv',
                  'stableID_key', 'proSequence', 'count_ENSP', 'count_ENSPv',
                  'count_ENST', 'count_ENSTv', 'count_ENSG', 'count_ENSGv',
                  'count_stableID_key', 'count_proSequence']
allset.set_index('ID', inplace=True)
allset.head(3)
alllist.columns = ['ID', 'Length', 'pro_ver', 'tx_ver', 'gen_ver', 'stdev_length',
                   'stdev_prov', 'stdev_txv', 'stdev_genv']
alllist.set_index('ID', inplace=True)
alllist.head(3)
ultimate = ultimate[['ID', 'labeled_pos_count']].copy()
ultimate.set_index('ID', inplace=True)
ultimate.head(3)
mismap.set_index('ID',inplace=True)
mismap.head(3)
# column-wise merge of the four ID-indexed tables
mer = pd.concat([allset, alllist, ultimate, mismap], axis=1)
mer.head(3)
os.chdir("/Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/TSV_UNIPROT_xref/MULTIMAPPING_IDS/MISMAP2.0/")
# Per-release mismap score files.  Suffixes:
#   *s = 8861 shared stable-ID keys, *a = all 3953 UKB IDs, *f = 1796 false UKB IDs
v97s = pd.read_csv("ENSEMBL_MISMAP_SCORED_v97_8861shared_stableIDkeys.csv")
v97a = pd.read_csv("ENSEMBL_MISMAP_SCORED_v97_3953UKB.csv")
v97f = pd.read_csv("ENSEMBL_MISMAP_SCORED_v97_1796UKBIDs.csv")
v96s = pd.read_csv("ENSEMBL_MISMAP_SCORED_v96_8861shared_stableIDkeys.csv")
v96a = pd.read_csv("ENSEMBL_MISMAP_SCORED_v96_3953UKB.csv")
v96f = pd.read_csv("ENSEMBL_MISMAP_SCORED_v96_1796UKBIDs.csv")
v94s = pd.read_csv("ENSEMBL_MISMAP_SCORED_v94_8861shared_stableIDkeys.csv")
v94a = pd.read_csv("ENSEMBL_MISMAP_SCORED_v94_3953UKB.csv")
v94f = pd.read_csv("ENSEMBL_MISMAP_SCORED_v94_1796UKBIDs.csv")
v92s = pd.read_csv("ENSEMBL_MISMAP_SCORED_v92_8861shared_stableIDkeys.csv")
v92a = pd.read_csv("ENSEMBL_MISMAP_SCORED_v92_3953UKB.csv")
v92f = pd.read_csv("ENSEMBL_MISMAP_SCORED_v92_1796UKBIDs.csv")
v85s = pd.read_csv("ENSEMBL_MISMAP_SCORED_v85_8861shared_stableIDkeys.csv")
v85a = pd.read_csv("ENSEMBL_MISMAP_SCORED_v85_3953UKB.csv")
v85f = pd.read_csv("ENSEMBL_MISMAP_SCORED_v85_1796UKBIDs.csv")
# sanity-check the row/column counts of each file group
dfs = [v85a, v92a, v94a, v96a, v97a]
for i in dfs:
    print(i.shape)
dfs = [v85s, v92s, v94s, v96s, v97s]
for i in dfs:
    print(i.shape)
dff = [v85f, v92f, v94f, v96f, v97f]
for i in dff:
    print(i.shape)
print(v85a.columns)
print()
print(v85s.columns)
print()
print(v85f.columns)
# frac_missed = (number of positions missed) / (number of positions searched), based on the total per UKB ID
# Add a release-number column to every frame before merging, so each row
# stays traceable to its Ensembl release after concatenation.
v97a['release'] = 97
v96a['release'] = 96
v94a['release'] = 94
v92a['release'] = 92
v85a['release'] = 85
v97s['release'] = 97
v96s['release'] = 96
v94s['release'] = 94
v92s['release'] = 92
v85s['release'] = 85
v97f['release'] = 97
v96f['release'] = 96
v94f['release'] = 94
v92f['release'] = 92
v85f['release'] = 85
# Concatenate all release files (each with a different number of ENSP rows, except the shared sets) together.
# Stack the five releases row-wise for each ID set.
all_mer = pd.concat([v85a, v92a, v94a, v96a, v97a])
share_mer = pd.concat([v85s, v92s, v94s, v96s, v97s])
false_mer = pd.concat([v85f, v92f, v94f, v96f, v97f])
# adding gene name column: seed with the ID, then map ID -> HGNC name
all_mer['HGNC_gene'] = all_mer['ID']
all_mer.HGNC_gene = all_mer.HGNC_gene.map(ref_gene)
all_mer.head(4)
# adding gene name column
share_mer['HGNC_gene'] = share_mer['ID']
share_mer.HGNC_gene = share_mer.HGNC_gene.map(ref_gene)
share_mer.head(4)
# adding gene name column
false_mer['HGNC_gene'] = false_mer['ID']
false_mer.HGNC_gene = false_mer.HGNC_gene.map(ref_gene)
false_mer.head(4)
print("shape of all 3953 in releases diff row count: ", all_mer.shape)
print("shape of shared keys in all releases same row count: ", share_mer.shape)
print("shape of false 1796 in releases diff row count: ", false_mer.shape)
share_mer.head(3)
# COPIED FROM ALL FUNCTIONS
# dynamic slope score for Rmerged (all releases axis0) files
def group_scores(df):
    """Collapse the release-merged frame to one row per stableID_key,
    gathering that key's 'hamming_normalized_dist' values into a list.
    Prints the grouped shape and first row as a sanity check."""
    per_key = df.groupby('stableID_key')['hamming_normalized_dist'].apply(list)
    gdf = per_key.to_frame().reset_index()
    print("group df shape: ", gdf.shape)
    print(gdf.head(1))
    return gdf
def mismaplines_dynamic(df, col):
    """Score how 'dynamic' each row's mismap distances are across releases.

    For every row, *col* holds a list of normalized distance values (one per
    Ensembl release).  The score is the number of DISTINCT values in that
    list, stored as a string in a new 'dynamic_slope_scores' column (strings
    so downstream code can treat the score as categorical).

    Fix: the original if-chain only handled 1-5 distinct values, so any row
    with 0 or >5 distinct values left the score list short and the column
    assignment raised ValueError (length mismatch).  Counting the distinct
    values directly generalizes to any list length; the unused per-row
    'stableID_key' lookup was also dropped.

    Modifies df in place and prints its shape as a sanity check.
    """
    # str(...) preserves the original string dtype of the score column
    df.loc[:, 'dynamic_slope_scores'] = [str(len(set(vals))) for vals in df[col]]
    print(df.shape)
#leven = group_scores(share_mer)
#hammin = group_scores(share_mer)
# NOTE(review): the two group_scores calls above are commented out, so
# `leven` and `hammin` must already exist from an earlier cell/session --
# otherwise the next two lines raise NameError.  Both commented calls also
# group the hamming column only; the levenshtein grouping presumably came
# from a variant of group_scores -- verify.
mismaplines_dynamic(leven, 'levenshtein_normalized_dist')
mismaplines_dynamic(hammin, 'hamming_normalized_dist')
leven.columns = ['stableID_key', 'levenshtein_normalized_dist', 'dynamic_slope_LEVENnorm']
hammin.columns = ['stableID_key', 'hamming_normalized_dist', 'dynamic_slope_HAMMINGnorm']
# checkColumnValues comes from all_funx (Toolbox star-import)
checkColumnValues(leven, 'dynamic_slope_LEVENnorm')
checkColumnValues(hammin, 'dynamic_slope_HAMMINGnorm')
# rows whose score differs in all 5 releases
leven[leven['dynamic_slope_LEVENnorm'] == '5']
# map leven score onto the merged shared table
levendict = dict(zip(leven.stableID_key, leven.dynamic_slope_LEVENnorm))
share_mer['dynamic_slope_LEVENnorm'] = share_mer['stableID_key']
share_mer.dynamic_slope_LEVENnorm = share_mer.dynamic_slope_LEVENnorm.map(levendict)
# map hamming score
hamdict = dict(zip(hammin.stableID_key, hammin.dynamic_slope_HAMMINGnorm))
share_mer['dynamic_slope_HAMMINGnorm'] = share_mer['stableID_key']
share_mer.dynamic_slope_HAMMINGnorm = share_mer.dynamic_slope_HAMMINGnorm.map(hamdict)
share_mer[share_mer['dynamic_slope_LEVENnorm'] == '5']
share_mer[share_mer['dynamic_slope_HAMMINGnorm'] == '5']
describeMe(share_mer)
share_mer.to_csv("LASER_dynamic_slope_leven_hamming_shared3887_44305rows.csv", index=False)
# ANOTHER FILE WITH LEVEN and HAMMING scores but not merged, columns added!
# dfs = [v85s, v92s, v94s, v96s, v97s]
# for i in dfs:
#     print(i.sort_values(['stableID_key'], inplace=True))
#     print(i.head(2))
# condf.columns = ['v85_levnorm', 'v92_levnorm', 'v94_levnorm', 'v96_levnorm', 'v97_levnorm']
# condf_ham.columns = ['v85_hamnorm', 'v92_hamnorm', 'v94_hamnorm', 'v96_hamnorm', 'v97_hamnorm']
# condf.head(2)
# condf.to_csv("LASER_LEVEN_NORM_shared_8861_ENSP.csv", index=False)
# condf_ham.to_csv("LASER_HAMMING_NORM_shared_8861_ENSP.csv", index=False)
# EACH RELEASE has a frac-missed score calculated for all ENSP mapping to the 3953 IDs.
# A GROUPBY version is created for each release (only 3953 rows total).
# RELEASE-LEVEL SCORE: take the fraction-missed scores for the 3953 UniProt IDs in each
# release and average them to create the release-level average.
# Only include unique ENSP sequences mapping to the same 3953 UKB IDs.
# For each release file (different rows, same UKB IDs): group by ID and set(proSequence),
# then calculate the fraction missed as
#   (total missed across all unique sequences mapping to one UKB ID)
#   / (total positions searched per UKB ID * number of sequences searched).
# Plot these values; each release will have 3953 scores from unique ENSP sequences only.
# The release level is created by averaging all 3953 fraction-missed scores per release.
# parse the stringified list column back into real Python lists
mer.frac_missed = mer.frac_missed.apply(literal_eval)
mer['avg_frac_missed_releaseLevel'] = mer['frac_missed'].apply(lambda x: mean(x))
# creating the all-releases average score table
mer.reset_index(inplace=True)
mer.columns = ['ID', 'ENSP', 'ENSPv', 'ENST', 'ENSTv', 'ENSG', 'ENSGv',
               'stableID_key', 'proSequence', 'count_ENSP', 'count_ENSPv',
               'count_ENST', 'count_ENSTv', 'count_ENSG', 'count_ENSGv',
               'count_stableID_key', 'count_proSequence', 'Length', 'pro_ver',
               'tx_ver', 'gen_ver', 'stdev_length', 'stdev_prov', 'stdev_txv',
               'stdev_genv', 'labeled_pos_count', 'frac_missed',
               'dynamic_slope_scores', 'avg_frac_missed_releaseLevel']
mer.head(3)
# adding gene name column (seed with ID, then map ID -> HGNC name)
mer['HGNC_gene'] = mer['ID']
mer.HGNC_gene = mer.HGNC_gene.map(ref_gene)
mer.head(2)
checkColumnValues(mer, 'dynamic_slope_scores')
checkColumnValues(mer, 'avg_frac_missed_releaseLevel')
# IDs where every searched position was missed in every release
worst17 = mer[mer['avg_frac_missed_releaseLevel'] == 1]
worst17.to_csv("WORST17_avg_frac_missed_allReleases_equals_1.csv", index=False)
# checking DMD gene (UniProt P11532) as a spot-check of the worst scorers
ultimate = pd.read_csv("/Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/TSV_UNIPROT_xref/MULTIMAPPING_IDS/ultimate_UKB_sequence_key_3953.csv")
DMDukb = ultimate[ultimate['ID'] == 'P11532']
DMDensp = worst17[worst17['ID'] == 'P11532']
DMDensp.to_csv("DMDensp_5releases_merge.csv")
DMDukb.to_csv("DMDukb_canonical.csv")
mer.describe()
all_mer.describe()
share_mer.describe()
false_mer.describe()
check = false_mer['hamming_normalized_dist'].value_counts()
check.to_csv("check_FALSE_distance_20720.csv")
# drop the long protein sequence before saving the release-level table
mer.drop(['proSequence'], axis=1, inplace=True)
mer.to_csv("RELEASE_LEVEL_3953_avgFRACMISSED_genename_3953rows.csv",index=False)
all_mer.to_csv("ENSP_LEVEL_all3953_genename_distanceMetrics_52417rows.csv",index=False)
share_mer.to_csv("ENSP_LEVEL_shared3887_genename_distanceMetrics_44305rows.csv",index=False)
false_mer.to_csv("ENSP_LEVEL_false1796_genename_distanceMetrics_20720rows.csv",index=False)
os.chdir("/Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/TSV_UNIPROT_xref/MULTIMAPPING_IDS/MISMAP2.0/")
mer = pd.read_csv("RELEASE_LEVEL_3953_avgFRACMISSED_genename_3953rows.csv")
mer.head(2)
# TOP 25 for TX PRO GENE biotypes (uncomment the sort for the biotype wanted)
#mer.sort_values(['stdev_prov'], ascending=False,inplace=True)
#mer.sort_values(['stdev_txv'], ascending=False,inplace=True)
#mer.sort_values(['stdev_genv'], ascending=False,inplace=True)
# grab top 25 for each sorted biotype file
#top25std = mer[0:25]
# saving
#top25std.to_csv("STD_ENSPv_top25_3953RELEASE_level.csv", index=False)
#top25std.to_csv("STD_ENSTv_top25_3953RELEASE_level.csv", index=False)
#top25std.to_csv("STD_ENSGv_top25_3953RELEASE_level.csv", index=False)
stable = pd.read_csv("ENSP_LEVEL_shared3887_genename_distanceMetrics_44305rows.csv")
stable.columns
slimstable = stable[['ENSPv', 'ENSTv', 'ENSGv', 'Length',
                     'stableID_key', 'ID', 'pro_ver', 'tx_ver', 'gen_ver',
                     'identical_2UKBseq', 'len_ensp_minus_ukb',
                     'hamming_normalized_dist',
                     'levenshtein_normalized_dist','release', 'HGNC_gene']].copy()
slimstable.sort_values(by=['stableID_key'], inplace=True)
slimstable.head(3)
def rangefinder(s):
    """Return the spread (max minus min) of the values in *s*."""
    return max(s) - min(s)
# group by stableID_key; make 3 separate list-columns of pro_ver, tx_ver, gen_ver
gene_stable = slimstable.groupby('stableID_key')['gen_ver'].apply(list)
pro_stable = slimstable.groupby('stableID_key')['pro_ver'].apply(list)
tx_stable = slimstable.groupby('stableID_key')['tx_ver'].apply(list)
# concat the three list-columns side by side (aligned on the stableID_key index)
list_columns = pd.concat([gene_stable, tx_stable, pro_stable], axis=1)
# creating a column for the version range (max - min) of each biotype
list_columns['range_gene_ver'] = list_columns['gen_ver'].apply(rangefinder)
list_columns['range_tx_ver'] = list_columns['tx_ver'].apply(rangefinder)
list_columns['range_pro_ver'] = list_columns['pro_ver'].apply(rangefinder)
list_columns.reset_index(inplace=True)
list_columns.head(3)
# stableID_key -> HGNC gene lookup (deduplicated)
refhgnc = stable[['stableID_key','HGNC_gene']].copy()
refhgnc.drop_duplicates(inplace=True)
refhgnc.head(3)
# mapping these columns to df with a dict
ref_hgnc = dict(zip(refhgnc.stableID_key, refhgnc.HGNC_gene))
# adding gene name column (seed with the key, then map key -> HGNC name)
list_columns['HGNC_gene'] = list_columns['stableID_key']
list_columns.HGNC_gene = list_columns.HGNC_gene.map(ref_hgnc)
list_columns.head(3)
list_columns.describe()
# TOP 25 for TX PRO GENE biotypes (uncomment the sort for the biotype wanted)
#list_columns.sort_values(['range_gene_ver'], ascending=False,inplace=True)
#list_columns.sort_values(['range_tx_ver'], ascending=False,inplace=True)
list_columns.sort_values(['range_pro_ver'], ascending=False,inplace=True)
# grab top 25 for each sorted biotype file
top25 = list_columns[0:25]
# saving
#top25.to_csv("RANGE_ENSGv_top25_8861stableKEY_level.csv", index=False)
#top25.to_csv("RANGE_ENSTv_top25_8861stableKEY_level.csv", index=False)
top25.to_csv("RANGE_ENSPv_top25_8861stableKEY_level.csv", index=False)
top25
os.chdir("/Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/TSV_UNIPROT_xref/MULTIMAPPING_IDS/MISMAP2.0/")
# reload the saved release-level and ENSP-level tables
mer = pd.read_csv("RELEASE_LEVEL_3953_avgFRACMISSED_genename_3953rows.csv")
specmer = pd.read_csv("ENSP_LEVEL_all3953_genename_distanceMetrics_52417rows.csv")
print(mer.shape)
print(specmer.shape)
# mer is 1 uniprot ID to many ENSP
# specmer is many ENSP to 1 UKB id
specmer_number = specmer[['hamming_normalized_dist', 'levenshtein_normalized_dist', 'missed_count','missed_frac',
                          'release']]
# Length diff is ENSP - UKB; the last number is more constant
specmer_number.columns = ['HammingN', 'LevenshteinN', 'Missed count','Missed frac',
                          'Release']
mer_number = mer[['count_ENSP', 'count_ENSPv', 'count_ENST', 'count_ENSTv',
                  'count_ENSG', 'count_ENSGv', 'count_proSequence',
                  'dynamic_slope_scores', 'avg_frac_missed_releaseLevel']].copy()
mer_number.columns = ['ENSP count', 'ENSPv count', 'ENST count', 'ENSTv count',
                      'ENSG count', 'ENSGv count', 'ProSequence count',
                      'Mismap slope', 'Release mismap mean']
mer_number.head(2)
specmer_number.head(2)
# change release to a category so plots treat it discretely
# NOTE(review): specmer_number was sliced without .copy(), so this
# assignment may emit SettingWithCopyWarning -- confirm intent.
specmer_number['Release'] = pd.Categorical(specmer_number.Release)
#specmer_number.dtypes
from pandas import DataFrame
import seaborn as sn
# lower-triangle correlation heatmap of the release-level numeric columns
corr1 = mer_number.corr()
mask = np.zeros_like(corr1)
mask[np.triu_indices_from(mask)] = True
with sn.axes_style("white"):
    f, ax = plt.subplots(figsize=(7, 5))
    ax =sn.heatmap(corr1, annot=True, annot_kws={"size": 6},vmin=0, vmax=1, mask=mask, square=True)
    plt.subplots_adjust(top=1, bottom=0.5)
#ax.set_ylim(len(8)-0.5, -0.5)
# fix for mpl bug that cuts off top/bottom of seaborn viz
b, t = plt.ylim() # discover the values for bottom and top
b += 0.5 # Add 0.5 to the bottom
t -= 0.5 # Subtract 0.5 from the top
plt.ylim(b, t) # update the ylim(bottom, top) values
#plt.show()
plt.savefig('correlation_matrix_3953UKBIDs_multimapping_info.pdf', dpi=300, bbox_inches = "tight")
# same heatmap for the ENSP-level distance metrics
corr2 = specmer_number.corr()
mask = np.zeros_like(corr2)
mask[np.triu_indices_from(mask)] = True
with sn.axes_style("white"):
    f, ax = plt.subplots(figsize=(8, 6))
    ax = sn.heatmap(corr2, annot=True, annot_kws={"size": 6},vmin=0, vmax=1, mask=mask, square=True)
    plt.subplots_adjust(top=1, bottom=0.5)
#ax.set_ylim(len(8)-0.5, -0.5)
# fix for mpl bug that cuts off top/bottom of seaborn viz
b, t = plt.ylim() # discover the values for bottom and top
b += 0.5 # Add 0.5 to the bottom
t -= 0.5 # Subtract 0.5 from the top
plt.ylim(b, t) # update the ylim(bottom, top) values
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_yticklabels(ax.get_yticklabels(), rotation=45)
#plt.show()
plt.savefig('correlation_matrix_ENSPIDs_52417distance2ukb_info.pdf', dpi=300, bbox_inches = "tight")
# numpy pandas figure: styled correlation table rendered inline
corr1 = mer_number.corr()
corr1.style.background_gradient(cmap='coolwarm').set_precision(2)
# https://stackoverflow.com/questions/29432629/plot-correlation-matrix-using-pandas
proseq = px.scatter(mer, x= "count_proSequence", y="avg_frac_missed_releaseLevel", color="avg_frac_missed_releaseLevel")
proseq.show()
corr2 = specmer_number.corr()
corr2.style.background_gradient(cmap='coolwarm').set_precision(2)
import plotly.graph_objects as go
# NOTE(review): corr2 is a DataFrame and has no write_image method, so this
# line raises AttributeError; a plotly Figure was probably intended -- confirm.
corr2.write_image("specificrelease_correlation_matric.png")
specmer_dist = specmer[['len_ensp_minus_ukb', 'hamming_normalized_dist',
                        'levenshtein_normalized_dist', 'missed_frac','release']].copy()
fig = px.scatter_matrix(specmer_dist,
                        dimensions=['len_ensp_minus_ukb', 'hamming_normalized_dist',
                                    'levenshtein_normalized_dist', 'missed_frac'],color="release")
fig.update_traces(diagonal_visible=False)
fig.show()
# NOTE(review): specmer_number's columns were renamed above ('HammingN', ...,
# 'Release'), so the original names used in the next two plots will not be
# found -- these were probably meant to use specmer instead; verify.
ham = px.scatter(specmer_number, x= "hamming_normalized_dist", y="missed_frac", color="release")
ham.show()
fig = px.scatter(specmer_number, x="hamming_normalized_dist", y="missed_frac", color="release", marginal_y="violin",
                 marginal_x="box")
fig.show()
# NOTE(review): dfnum is not defined anywhere in this chunk (earlier cell?),
# and the column assignment below is duplicated verbatim -- likely a stray paste.
dfnum.columns = ['dynamic slope', 'avg mismap', 'mean ENSP',
                 'SD ENSP', 'total pos', 'UKB C abun',
                 'UKB K abun', 'UKB length']
dfnum.columns = ['dynamic slope', 'avg mismap', 'mean ENSP',
                 'SD ENSP', 'total pos', 'UKB C abun',
                 'UKB K abun', 'UKB length']
fig3 = px.parallel_coordinates(dfnum, color="avg mismap",
                               color_continuous_scale=px.colors.diverging.Tealrose, color_continuous_midpoint=0.5)
fig3.show()
lines1 = px.parallel_coordinates(specmer_dist, color="release",
                                 color_continuous_scale=px.colors.diverging.Tealrose, color_continuous_midpoint=2)
lines1.show()